library(here)
## here() starts at C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining
library(tidyverse)
## Warning: Paket 'tidyverse' wurde unter R Version 4.1.3 erstellt
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.5.0 
## v readr   2.1.3      v forcats 0.5.2
## Warning: Paket 'ggplot2' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'tibble' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'tidyr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'readr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'purrr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'dplyr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'stringr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'forcats' wurde unter R Version 4.1.3 erstellt
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
here()
## [1] "C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining"
# Load the submissions data set used by every analysis below. The relative
# path is resolved against the current working directory.
eda <- readRDS("three_submission.rds")  

Number of submissions per organisation type for three consultation rounds

# Spread of submission counts per organisation type, one box per
# consultation round. Rows whose size is "Missing" are left out.
eda %>%
  filter(size != "Missing") %>%
  count(type, consult_round, size, sort = TRUE) %>%
  mutate(size = fct_reorder(size, n)) %>%
  ggplot(mapping = aes(x = type, y = n, fill = consult_round)) +
  geom_boxplot() +
  coord_flip()

library(see)
# Violin-dot plot of submission counts over time, filled by consultation
# round. Counts are taken per type / time / country / round combination.
eda %>%
  filter(size != "Missing") %>%
  count(type, time, country, consult_round, sort = TRUE) %>%
  mutate(country = fct_reorder(country, n)) %>%
  ggplot(mapping = aes(x = time, y = n, fill = consult_round)) +
  geom_violindot(fill_dots = "black") +
  theme_modern() +
  scale_fill_material_d() +
  coord_flip()

Which countries and which sizes of organisation submitted most often during the three submission rounds?

# Submission counts per country, coloured by organisation size, with one
# panel per consultation round.
eda %>%
  filter(size != "Missing") %>%
  # count = number of rows sharing the same size / country / round
  add_count(size, country, consult_round, name = "count") %>%
  ggplot(aes(x = country, y = count, colour = size, group = country)) +
  geom_count() +
  coord_flip() +
  # geom_line() +
  facet_wrap(~consult_round) +
  theme_bw() +
  # x is mapped to country (drawn on the vertical axis after coord_flip()),
  # so the axis is labelled as country rather than organisation size
  labs(y = "submission count", x = "Country") +
  theme(legend.position = "right")

 # ggsave("featuregraph2.png",b,  width = 9, height = 6, units = "in")
# Submission counts per organisation type, coloured by size, with one panel
# per country. Rows with a "Missing" country or size are excluded.
eda %>%
  filter(country != "Missing", size != "Missing") %>%
  # count = number of remaining rows for each organisation type
  add_count(type, name = "count") %>%
  ggplot(aes(x = type, y = count, colour = size, group = size)) +
  coord_flip() +
  geom_count() +
  # geom_line() +
  facet_wrap(~country) +
  theme_bw() +
  labs(y = "submission count", x = "Type of Organisation")

#+
    # theme (legend.position = "none")

  #ggsave("featuregraph.png",c,  width = 9, height = 6, units = "in")
# Number of submissions for the five most frequent countries; all remaining
# countries are lumped together by fct_lump().
eda %>%
  mutate(origin = fct_lump(country, n = 5)) %>%
  count(origin)
library(tidytext)
## Warning: Paket 'tidytext' wurde unter R Version 4.1.3 erstellt
# Submissions per country, one panel per organisation type. Only the nine
# most frequent countries are kept (lumped "Other" and "Missing" are dropped).
eda %>%
  count(country = fct_lump(country, 9), type) %>%
  filter(country != "Other", country != "Missing") %>%
  mutate(
    # Order the countries by count separately inside every panel;
    # scale_y_reordered() below removes the suffix reorder_within() appends.
    country = reorder_within(country, n, type),
    type = fct_reorder(type, n)
  ) %>%
  ggplot(aes(x = n, y = country)) +
  geom_col() +
  scale_y_reordered() +
  # free y scale so each panel only lists its own (re-ordered) countries
  facet_wrap(vars(type), scales = "free_y") +
  labs(y = "Countries",
       x = "Number of submissions",
       title = "Which organisations from which countries submitted most often?")

# Submissions per interest group, one panel per country. Only the nine most
# frequent countries are kept; "Other"/"Missing" categories are dropped.
eda %>%
  count(country = fct_lump(country, 9), type) %>%
  filter(country != "Other", country != "Missing", type != "Other") %>%
  # Order the interest groups by count separately inside every country panel:
  # item to reorder, what to reorder by, the group to reorder within.
  # scale_y_reordered() below removes the suffix reorder_within() appends.
  mutate(type = reorder_within(type, n, country)) %>%
  ggplot(aes(x = n, y = type)) +
  geom_col() +
  scale_y_reordered() +
  # free y scale so each panel only lists its own (re-ordered) groups
  facet_wrap(vars(country), scales = "free_y") +
  labs(y = "Interest Groups",
       x = "Number of submissions",
       title = "Which organisations from which countries submitted most often?")

# Bar chart of submission counts for the six most frequent countries (the
# rest lumped together), ordered by count. Rows without a country are skipped.
eda %>%
  filter(!is.na(country)) %>%
  mutate(initiatives = fct_lump(country, 6)) %>%
  count(initiatives) %>%
  mutate(initiatives = fct_reorder(initiatives, n)) %>%
  ggplot(mapping = aes(x = n, y = initiatives)) +
  geom_col()

#library(codebook)

#codebook_data <- detect_missing(submission,
#    only_labelled = TRUE, # only labelled values are autodetected as
                                   # missing
#    negative_values_are_missing = FALSE, # negative values are NOT missing values
#    ninety_nine_problems = TRUE,   # 99/999 are missing values, if they
                                   # are more than 5 MAD from the median
#    )

#codebook(codebook_data)

#library(dataMaid)
#makeCodebook(submission, reportTitle = "ec crawling")
# Chi-square test of independence between organisation type and size.
# A category without any observation yields expected counts of zero, which
# turns the statistic into NaN, so empty rows and columns are dropped first.
type_size_table <- table(eda$type, eda$size)
type_size_table <- type_size_table[
  rowSums(type_size_table) > 0,
  colSums(type_size_table) > 0,
  drop = FALSE
]
summary(as.table(type_size_table)) # chi-square test
## Number of cases in table: 1065 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = NaN, df = 44, p-value = NA
##  Chi-squared approximation may be incorrect
# One row per word of every submission text, keeping the submission metadata
# and discarding every word listed in tidytext's `stop_words`.
token <- eda %>%
  select(id, text, time, type, org, size, country, consult_round) %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!word %in% stop_words$word)
# Words that occur more than 200 times within one organisation type and
# country, drawn as horizontal bars ordered by frequency.
token %>%
  # count() without a preceding group_by() returns an ungrouped table, so
  # `word` can be re-levelled below (mutate() refuses to modify a grouping
  # column)
  count(word, type, country, sort = TRUE) %>%
  filter(n > 200) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, group = type)) +
  geom_col() +
  labs(y = NULL)

#token %>%
#  count(word, sort = TRUE) %>%
 # filter(word!= "de", word != "nand", word != "nthe", word!= "article", word != "human") %>% 
 # mutate(word = reorder(word, n)) %>%
 # ggplot(aes(n, word)) +
 # geom_col() +
  #labs(y = NULL)
library(textdata)
## Warning: Paket 'textdata' wurde unter R Version 4.1.3 erstellt
# Entries of the NRC lexicon tagged with the sentiment "joy"
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Frequency of those joy words in the submissions, most frequent first
token %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
library(tidyr)

#token$id <- readr::parse_number(token$id) 

# Net Bing sentiment for every combination of organisation type, consultation
# round and country: words are matched against the Bing lexicon, counted per
# sentiment, spread into `positive` / `negative` columns (0 when absent) and
# subtracted. Rows with type "Missing" are removed at the end.
sentiment <- token %>%
  inner_join(get_sentiments("bing")) %>%
  count(type, sentiment, consult_round, country) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative) %>% 
  filter(type != "Missing")
## Joining, by = "word"
# index = id %/% 80,

Bing Sentiment-Scores for each Country

library(ggplot2)

# Distribution of the net Bing sentiment score per consultation round, with
# one panel per country.
ggplot(sentiment, aes(consult_round, sentiment, fill = consult_round)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(vars(country)) +
  # x is mapped to consult_round, so the axis is labelled as such
  labs(x = "Consultation Round",
       y = "Bing Sentiment Scores")

  #ggsave("featuregraph.png",c,  width = 9, height = 6, units = "in")
# How often each Bing-lexicon word occurs, together with its sentiment,
# most frequent first.
bing_word_counts <- token %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
## Joining, by = "word"
# Ten most frequent words per sentiment (ties kept by slice_max()), shown as
# bars in one panel per sentiment with independent y axes.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  # order the bars by frequency
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

 #ggsave("featuregraph.png",c,  width = 9, height = 6, units = "in")
library(wordcloud)
## Lade nötiges Paket: RColorBrewer
## Warning: Paket 'RColorBrewer' wurde unter R Version 4.1.3 erstellt
 # Word cloud of the 100 most frequent words. Besides stop words, a list of
 # unwanted tokens (foreign function words, numbers and what look like
 # scraping artefacts such as "nthe" or "xa0") is excluded.
 token %>%
   anti_join(stop_words, by = "word") %>%
   count(word) %>%
   filter(
     # the original chain of `!=` comparisons also dropped missing words
     !is.na(word),
     !word %in% c(
       "de", "nand", "nthe", "article", "human", "nai",
       "la", "des", "nof", "na", "nto", "en",
       "xa0", "und", "nthis", "annex4", "ánational",
       "annex", "3", "1", "2021", "e.g.", "nfor"
     )
   ) %>%
   with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

library(reshape2)
## 
## Attache Paket: 'reshape2'
## Das folgende Objekt ist maskiert 'package:tidyr':
## 
##     smiths
# Comparison cloud of Bing-lexicon words: counts are cast into a
# word-by-sentiment matrix (0 where a word lacks a sentiment) and passed to
# comparison.cloud(), limited to 95 words.
token %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 95)
## Joining, by = "word"

# Negative entries of the Bing sentiment lexicon
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

# Total number of tokens per organisation type, country and consultation round
wordcounts <- token %>%
  group_by(type, country, consult_round) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'type', 'country'. You can override using
## the `.groups` argument.
# Share of negative (Bing) words among all words per organisation type,
# country and consultation round; keeps the highest-ratio row of every type.
token %>%
  semi_join(bingnegative, by = "word") %>%
  # Count negatives at the same granularity as `wordcounts`. Counting per
  # type/country only would pair one all-round numerator with several
  # per-round denominators in the join below.
  count(type, country, consult_round, name = "negativewords") %>%
  left_join(wordcounts, by = c("type", "country", "consult_round")) %>%
  mutate(ratio = negativewords / words) %>%
  # same exclusion as the plots of this ratio (was compared against 0)
  filter(type != "Missing") %>%
  group_by(type) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
# Highest share of negative (Bing) words reached by each organisation type,
# drawn as horizontal bars.
token %>%
  semi_join(bingnegative, by = "word") %>%
  # Count negatives at the same granularity as `wordcounts`. Counting per
  # type/country only would pair one all-round numerator with several
  # per-round denominators in the join below.
  count(type, country, consult_round, name = "negativewords") %>%
  left_join(wordcounts, by = c("type", "country", "consult_round")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(type != "Missing") %>%
  group_by(type) %>%
  slice_max(ratio, n = 1) %>%
  ungroup() %>%
  ggplot(aes(ratio, type)) +
  geom_col(show.legend = TRUE)
## Joining, by = "word"
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.

# For each organisation type, the country / consultation round with the
# highest share of negative (Bing) words, drawn as points coloured by round.
token %>%
  semi_join(bingnegative, by = "word") %>%
  # Count negatives at the same granularity as `wordcounts`. Counting per
  # type/country only would pair one all-round numerator with several
  # per-round denominators in the join below.
  count(type, country, consult_round, name = "negativewords") %>%
  left_join(wordcounts, by = c("type", "country", "consult_round")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(type != "Missing") %>%
  group_by(type) %>%
  slice_max(ratio, n = 1) %>%
  ungroup() %>%
  ggplot(aes(ratio, country, group = type, color = consult_round)) +
  geom_point(show.legend = TRUE)
## Joining, by = "word"
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.

# Split every submission text into bigrams (pairs of consecutive words) and
# drop the rows for which no bigram could be formed.
ec_bigrams <- eda %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  drop_na(bigram)

# Most frequent bigrams first
count(ec_bigrams, bigram, sort = TRUE)
library(tidyr)

# Split each bigram into its two words
bigrams_separated <- separate(
  ec_bigrams, bigram,
  into = c("word1", "word2"), sep = " "
)

# Keep only bigrams in which neither word is a stop word
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# new bigram counts:
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

bigram_counts
# Glue the two words back together into a single `bigram` column
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")

bigrams_united
# tf-idf of every bigram, treating each country as one document; highest
# scores first.
bigram_tf_idf <- bigrams_united %>%
  # Rows without a country would add to a bigram's document frequency without
  # adding a document, which can push idf (and tf-idf) below zero.
  filter(!is.na(country)) %>%
  count(country, bigram) %>%
  bind_tf_idf(bigram, country, n) %>%
  arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
##  Input should have exactly one row per document-term combination.
library(forcats)
library(ggforce)
## Warning: Paket 'ggforce' wurde unter R Version 4.1.3 erstellt
#source:https://www.tidytextmining.com/tfidf.html

# Bar chart of the nine highest tf-idf bigrams of a single country.
#   tf_idf_tbl     : table with `country`, `bigram` and `tf_idf` columns
#   target_country : country to plot (one plot per country; use
#                    facet_wrap_paginate(~ country, ...) for all countries)
#   noise_bigrams  : bigrams to leave out of the ranking
plot_country_bigrams <- function(tf_idf_tbl, target_country, noise_bigrams) {
  tf_idf_tbl %>%
    filter(country == target_country) %>%
    filter(!is.na(bigram), !bigram %in% noise_bigrams) %>%
    slice_max(tf_idf, n = 9) %>%
    ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = country)) +
    geom_col(show.legend = FALSE) +
    labs(x = "tf-idf", y = NULL)
}

plot_country_bigrams(
  bigram_tf_idf, "United Kingdom",
  c("13 november", "accessed 13")
)

plot_country_bigrams(
  bigram_tf_idf, "United States",
  c("ieee global", "systems nlaw", "4.0 united", "noncommercial 4.0",
    "nfurther resources")
)

plot_country_bigrams(
  bigram_tf_idf, "Ireland",
  c("ieee global", "systems nlaw", "4.0 united", "noncommercial 4.0",
    "nfurther resources")
)

 #ggsave("featuregraph.png",c,  width = 9, height = 6, units = "in")
library(widyr)
 # Word frequencies per consultation round, most frequent first
 words_by_type <- count(token, consult_round, word, sort = TRUE)

 # Pairwise correlation between consultation rounds, computed from their
 # word counts
 ec_cors <- pairwise_cor(words_by_type, consult_round, word, n, sort = TRUE)
 

 
library(ggraph)
library(igraph)
## 
## Attache Paket: 'igraph'
## Die folgenden Objekte sind maskiert von 'package:dplyr':
## 
##     as_data_frame, groups, union
## Die folgenden Objekte sind maskiert von 'package:purrr':
## 
##     compose, simplify
## Das folgende Objekt ist maskiert 'package:tidyr':
## 
##     crossing
## Das folgende Objekt ist maskiert 'package:tibble':
## 
##     as_data_frame
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     decompose, spectrum
## Das folgende Objekt ist maskiert 'package:base':
## 
##     union
# Fixed seed so the force-directed ("fr") layout is reproducible
set.seed(2017)

# Network of item pairs from `ec_cors` whose correlation exceeds 0.8; edge
# transparency and width both encode the correlation.
ec_cors %>%
  filter(correlation > .8) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation, width = correlation)) +
  geom_node_point(size = 3, color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

library(igraph)
# Directed graph of word pairs that occur together more than 20 times
# (word1 -> word2, with the count `n` as edge attribute)
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()

library(ggraph)
# Fixed seed so the force-directed ("fr") layout is reproducible
set.seed(2017)

# Plain version of the bigram network: edges, nodes and word labels
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Fixed seed so the force-directed ("fr") layout is reproducible
set.seed(2020)

# Closed arrow head used to show the direction word1 -> word2
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Styled version of the bigram network: edge transparency encodes the bigram
# count `n`, and edges stop slightly before the node they point to.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = TRUE,
                 arrow = a, end_cap = circle(.07, 'inches'),edge_colour = "tomato") +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

library(textfeatures) #extract features
library(skimr) #summary statistics
library(data.table)

# Extract text features (character/word counts, sentiment scores, ...) from
# every submission text and store them as one nested data-frame column.
eda$ec_features <- textfeatures(eda$text, word_dims = 0,  sentiment = TRUE, normalize = FALSE) #word2vec_dims
## <U+21AA> Counting features in text...
## <U+21AA> Sentiment analysis...
## <U+21AA> Parts of speech...
## <U+21AA> Word dimensions started
## <U+2714> Job's done!
# Flatten the nested column into top-level columns named
# "ec_features.<feature>". unpack() does this directly, without a data.table
# round trip and without handing `eda` to as_tibble() a second time.
eda <- eda %>%
  tidyr::unpack(ec_features, names_sep = ".") %>%
  as_tibble()


# Keep all non-numeric columns plus the numeric columns whose variance
# (ignoring NAs) is at least `min`.
#   x   : data frame
#   min : minimum variance a numeric column needs in order to be kept
# Returns `x` restricted to the kept columns, non-numeric columns first.
min_var <- function(x, min = 1) {
  is_num <- vapply(x, is.numeric, logical(1))
  non_num <- names(x)[!is_num]
  num_cols <- x[is_num]
  # The variance is NA for columns with fewer than two non-missing values;
  # isTRUE() treats those as "below the threshold" instead of producing an
  # NA column name that would break the subset below.
  keep <- vapply(
    num_cols,
    function(.x) isTRUE(stats::var(.x, na.rm = TRUE) >= min),
    logical(1)
  )
  x[c(non_num, names(num_cols)[keep])]
}
# Summary statistics (via skim()) of the extracted text features, restricted
# by min_var() to the columns with a variance of at least 1.
eda %>% 
  select(starts_with("ec_features")) %>% 
min_var() %>% 
  skim()
Data summary
Name Piped data
Number of rows 1659
Number of columns 26
_______________________
Column type frequency:
numeric 26
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ec_features.n_urls 900 0.46 5.68 18.15 0.00 0.00 0.00 3.00 268.00 ▇▁▁▁▁
ec_features.n_mentions 900 0.46 0.90 2.15 0.00 0.00 0.00 1.00 28.00 ▇▁▁▁▁
ec_features.n_chars 900 0.46 22761.41 37917.82 32.00 7095.00 13142.00 24291.50 685800.00 ▇▁▁▁▁
ec_features.n_uq_chars 900 0.46 73.87 10.31 13.00 69.00 74.00 79.00 126.00 ▁▁▇▂▁
ec_features.n_commas 900 0.46 216.11 471.20 0.00 52.50 108.00 212.00 9646.00 ▇▁▁▁▁
ec_features.n_digits 900 0.46 302.26 837.24 0.00 48.00 102.00 234.00 10810.00 ▇▁▁▁▁
ec_features.n_exclaims 900 0.46 0.18 2.02 0.00 0.00 0.00 0.00 50.00 ▇▁▁▁▁
ec_features.n_extraspaces 900 0.46 1186.66 3816.07 0.00 99.50 309.00 873.00 51571.00 ▇▁▁▁▁
ec_features.n_lowers 900 0.46 20445.77 33248.99 5.00 6387.00 12135.00 22107.00 593435.00 ▇▁▁▁▁
ec_features.n_periods 900 0.46 272.69 705.23 0.00 51.00 99.00 201.50 9837.00 ▇▁▁▁▁
ec_features.n_words 900 0.46 3903.64 6361.28 4.00 1198.00 2264.00 4274.00 111074.00 ▇▁▁▁▁
ec_features.n_uq_words 900 0.46 1250.20 1330.38 4.00 566.00 914.00 1466.00 20268.00 ▇▁▁▁▁
ec_features.n_caps 900 0.46 935.01 2099.14 1.00 243.00 469.00 888.50 37760.00 ▇▁▁▁▁
ec_features.n_nonasciis 900 0.46 412.03 1854.77 0.00 24.00 78.00 246.00 35260.00 ▇▁▁▁▁
ec_features.n_puncts 900 0.46 481.20 1204.25 5.00 93.00 220.00 476.50 22904.00 ▇▁▁▁▁
ec_features.n_charsperword 900 0.46 6.45 11.78 3.70 5.51 5.67 5.85 291.98 ▇▁▁▁▁
ec_features.sent_afinn 0 1.00 23.42 155.47 -1389.00 0.00 0.00 32.00 4646.00 ▁▇▁▁▁
ec_features.sent_bing 0 1.00 15.81 106.38 -452.00 0.00 0.00 19.00 3630.00 ▇▁▁▁▁
ec_features.sent_syuzhet 0 1.00 49.26 181.03 -330.80 0.00 0.00 60.98 6033.25 ▇▁▁▁▁
ec_features.sent_vader 0 1.00 68.89 245.09 -1230.40 0.00 0.00 91.20 7558.90 ▇▁▁▁▁
ec_features.n_polite 0 1.00 -0.77 1.70 -9.29 -1.22 0.00 0.00 4.24 ▁▁▂▇▁
ec_features.n_first_personp 0 1.00 1.16 1.56 0.00 0.00 0.00 3.00 5.00 ▇▁▂▂▁
ec_features.n_second_personp 0 1.00 0.88 1.20 0.00 0.00 0.00 2.00 5.00 ▇▃▁▁▁
ec_features.n_third_person 0 1.00 1.61 2.11 0.00 0.00 0.00 4.00 6.00 ▇▁▁▁▂
ec_features.n_tobe 0 1.00 2.64 3.34 0.00 0.00 0.00 6.00 10.00 ▇▁▂▁▁
ec_features.n_prepositions 0 1.00 10.31 13.39 0.00 0.00 0.00 23.00 49.00 ▇▁▂▂▁
# Boxplots of the VADER sentiment score per country, one panel per
# consultation round.
eda %>% 
  ggplot(aes(ec_features.sent_vader,country, color = country)) + 
  geom_boxplot(show.legend = FALSE)+ 
  facet_wrap(~ consult_round)

# Horizontal boxplot of one sentiment score per organisation type.
#   data      : data frame with a `type` column and the score column
#   score_col : name (string) of the sentiment score column to plot
#   hline_col : colour of the dashed reference line at zero
# Submissions with type "Missing" or a score above 3000 are left out.
sentiment_by_type_plot <- function(data, score_col, hline_col = "gray") {
  data %>%
    filter(type != "Missing") %>%
    filter(.data[[score_col]] <= 3000) %>%
    ggplot(aes(type, .data[[score_col]], color = type)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = hline_col) +
    geom_boxplot(alpha = 0.8, show.legend = FALSE) +
    theme(text = element_text(size = 8)) +
    # Zoom through the coordinate system: ylim() would discard observations
    # outside the window before the boxplot statistics are computed.
    coord_flip(ylim = c(-200, 600)) +
    labs(y = score_col)
}

a <- sentiment_by_type_plot(eda, "ec_features.sent_vader", hline_col = "lightgray")
b <- sentiment_by_type_plot(eda, "ec_features.sent_afinn")
c <- sentiment_by_type_plot(eda, "ec_features.sent_syuzhet")
d <- sentiment_by_type_plot(eda, "ec_features.sent_bing")
library(ggpubr)
## Warning: Paket 'ggpubr' wurde unter R Version 4.1.3 erstellt
theme_set(theme_pubr())


  # Combine the four per-type sentiment plots into a labelled 2 x 2 grid
  ggarrange(a,b,c,d ,
            labels = c("vader", "afinn", "syuzhet", "bing"),
            ncol = 2, nrow = 2)
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 25 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 11 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 10 rows containing non-finite values (`stat_boxplot()`).

#ggsave("featuregraph.png",figure,  width = 9, height = 6, units = "in")
# Horizontal boxplot (with jittered observations) of one sentiment score per
# country.
#   data      : data frame with a `country` column and the score column
#   score_col : name (string) of the sentiment score column to plot
#   hline_col : colour of the dashed reference line at zero
# Submissions with country "Missing" or a score above 3000 are left out.
sentiment_by_country_plot <- function(data, score_col, hline_col = "gray") {
  data %>%
    filter(country != "Missing") %>%
    filter(.data[[score_col]] <= 3000) %>%
    ggplot(aes(country, .data[[score_col]], color = country)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = hline_col) +
    geom_boxplot(alpha = 0.8, show.legend = FALSE) +
    geom_jitter(width = 0.15, alpha = .2) +
    theme(text = element_text(size = 8), legend.position = "none") +
    # Zoom through the coordinate system: ylim() would discard observations
    # outside the window before the boxplot statistics are computed.
    coord_flip(ylim = c(-200, 600)) +
    labs(y = score_col)
}

z <- sentiment_by_country_plot(eda, "ec_features.sent_vader", hline_col = "lightgray")

y <- sentiment_by_country_plot(eda, "ec_features.sent_afinn")

# the syuzhet plot previously filtered on `type` instead of `country`
x <- sentiment_by_country_plot(eda, "ec_features.sent_syuzhet")


w <- sentiment_by_country_plot(eda, "ec_features.sent_bing")



  # Combine the four per-country sentiment plots into a 2 x 2 grid
  ggarrange(z,y,x,w ,
           # labels = c("vader", "afinn", "syuzhet", "bing"),
            ncol = 2, nrow = 2)

library(ggforce)

 # Mean of every (count-scaled, standardised) text feature per country, shown
 # as lollipop-style bars on the first page of a 2 x 3 panel layout.
 eda %>%
  select(starts_with("ec_features"), country) %>%
  drop_na() %>%
  scale_count() %>%
  scale_standard() %>%
  group_by(country) %>%
  # across()/pivot_longer() replace the superseded summarise_if()/gather()
  summarise(across(where(is.numeric), mean)) %>%
  pivot_longer(-country, names_to = "var", values_to = "val") %>%
  arrange(-val) %>%
  # keep the features in order of decreasing value on the axis
  mutate(var = factor(var, levels = unique(var))) %>%
  ggplot(aes(x = var, y = val, fill = country)) +
  geom_col(width = .15, fill = "#000000bb") +
  geom_point(size = 2.5, shape = 21) +
  theme_light() +
  facet_wrap_paginate(~ country, nrow = 2, ncol = 3, page = 1) +
  coord_flip() +
  theme(
    legend.position = "none",
    axis.text = element_text(colour = "black"),
    axis.text.x = element_text(size = rel(.7))
    # panel.grid.major = element_line(colour = "#333333", size = rel(.05)),
    # panel.grid.minor = element_line(colour = "#333333", size = rel(.025))
  ) +
  labs(y = NULL, x = NULL,
    title = "{textfeatures}: Extract Features from EC-Text",
    subtitle = "Features extracted from text of EC initiatives 2020-2021")

## save plot
#ggsave("featuregraph.png", featuregraph, width = 9, height = 6, units = "in")
library(quanteda)
## Warning: Paket 'quanteda' wurde unter R Version 4.1.3 erstellt
## Package version: 3.2.4
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
# Tokenise the submission texts with quanteda, dropping punctuation, numbers
# and symbols, then remove English stop words from the "marimo" list.
toks <- tokens(
  eda$text,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  # spelled out in full: `remove_symbol` only worked through partial
  # argument matching
  remove_symbols = TRUE
) %>%
  tokens_remove(pattern = stopwords("en", source = "marimo"))
## Warning: NA is replaced by empty string
# Keyword-in-context listing for every token matching the glob "threat*",
# with three tokens of context on either side
kwic(toks, pattern = "threat*", valuetype = "glob", window = 3)